1. Managed Table

-- Managed table demo: Hive owns both the metadata and the data files
-- under the warehouse directory.
CREATE TABLE employee_test (
    name string,
    age  int
);

-- Dropping a managed table deletes its data files as well as the metadata.
DROP TABLE employee_test;
			
-- Partitioned managed table; edate is a partition column, not a data column.
CREATE TABLE employee_test_partition(
			name string,
			age int)
partitioned by (edate string);

-- Dynamic-partition inserts: the last VALUES field feeds the edate partition.
-- Requires hive.exec.dynamic.partition=true (nonstrict mode; see section 2).
insert into table employee_test_partition partition (edate) values("peter", 30, "20161001") ;
insert into table employee_test_partition partition (edate) values("john", 50, "20161003") ;

ALTER TABLE employee_test_partition PARTITION(edate="20161001") RENAME TO PARTITION(edate="20161011");
ALTER TABLE employee_test_partition DROP IF EXISTS PARTITION(edate="20161011");
-- LOAD DATA INPATH moves (not copies) the HDFS file into the partition dir.
LOAD DATA INPATH '/user/root/employee_test_partition.txt' INTO TABLE employee_test_partition PARTITION (edate="20161101");
-- NOTE: with ADD PARTITION ... LOCATION the files are NOT moved into the
-- warehouse even for a managed table; editing the files is immediately
-- reflected in the table.  (Was trailing uncommented prose — broke the script.)
ALTER TABLE employee_test_partition ADD PARTITION (edate="20171101") location "/user/root/20171101";


# Manually rename the partition KEY itself (edate -> emdate); Hive has no
# ALTER statement for this.  Step 1: rename the partition directories in HDFS.
hadoop fs -mv /apps/hive/warehouse/employee_test_partition/edate=20161001 /apps/hive/warehouse/employee_test_partition/emdate=20161001
hadoop fs -mv /apps/hive/warehouse/employee_test_partition/edate=20161003 /apps/hive/warehouse/employee_test_partition/emdate=20161003
-- Step 2: drop the stale partition metadata still pointing at edate=...
ALTER TABLE employee_test_partition DROP IF EXISTS PARTITION(edate="20161001");
ALTER TABLE employee_test_partition DROP IF EXISTS PARTITION(edate="20161003");
-- Step 3: rewrite the key name directly in the metastore database.
-- NOTE(review): TBL_ID = 18 is environment-specific — look it up in hive.TBLS
-- before running this against another cluster.
update hive.PARTITION_KEYS set PKEY_NAME = "emdate" where TBL_ID = 18;
-- Step 4: re-register the renamed directories under the new key name.
ALTER TABLE employee_test_partition ADD PARTITION (emdate="20161001") location "/apps/hive/warehouse/employee_test_partition/emdate=20161001";
ALTER TABLE employee_test_partition ADD PARTITION (emdate="20161003") location "/apps/hive/warehouse/employee_test_partition/emdate=20161003";


-- Managed table with an explicit LOCATION outside the warehouse directory.
CREATE TABLE employee_test(
			name string,
			age int
			)
location "/user/root/20171101";
-- NOTE: the files are NOT moved into the warehouse even though the table is
-- managed; changing the files is immediately reflected in the table.
-- (Was trailing uncommented prose — broke the script.)


2. Dynamic Partition

-- SET with no "=" just prints the current value of the property.
SET hive.exec.dynamic.partition.mode;
-- Enable dynamic-partition inserts for this session.
SET hive.exec.dynamic.partition=true;

Dynamic partition inserts can be a resource hog, because they can generate a large number of partitions in a short time. To protect against this, Hive defines three parameters:

hive.exec.max.dynamic.partitions.pernode (default value being 2000) is the maximum number of dynamic partitions that can be created by each mapper or reducer. If one mapper or reducer creates more than the threshold, a fatal error is raised from the mapper/reducer (through a counter) and the whole job is killed.
hive.exec.max.dynamic.partitions (default value being 5000) is the total number of dynamic partitions that can be created by one DML statement. If no single mapper/reducer exceeds its limit but the total number of dynamic partitions does, an exception is raised at the end of the job before the intermediate data are moved to the final destination.
hive.exec.max.created.files (default value being 5000) is the maximum total number of files created by all mappers and reducers. It is implemented by having each mapper/reducer update a Hadoop counter whenever a new file is created. If the total exceeds hive.exec.max.created.files, a fatal error is thrown and the job is killed.



3. External Table

-- External table: DROP TABLE removes only the metadata, never the data files.
CREATE EXTERNAL TABLE employee_external_test(
			name string,
			age int
)
partitioned by (edate string);

-- Without an explicit LOCATION the data still lands under the warehouse
-- directory even though the table is EXTERNAL.
-- (Was trailing uncommented prose — broke the script.)
insert into table employee_external_test partition (edate) values("peter", 30, "20161001") ;
-- emdate exists after the manual partition-key rename performed earlier.
insert into table employee_external_test partition (edate) select name, age, emdate from employee_test_partition;
-- SELECT * works here because the partition column is the last one returned.
insert into table employee_external_test partition (edate) select * from employee_test_partition;

-- Syntax templates only — ACID UPDATE/DELETE require a transactional table:
--   UPDATE tablename SET column = value [, column = value ...] [WHERE expression]
--   DELETE FROM tablename [WHERE expression]


# Stage a local file into HDFS, then LOAD it into a partition.
hadoop fs -put '/root/TrainingOnHDP/dataset/employee_test_partition.txt' /user/root/employee_test_partition.txt
-- NOTE: LOAD DATA INPATH moves the source file away even if the table is
-- external.  (Was trailing uncommented prose with typo "ig" for "if".)
LOAD DATA INPATH '/user/root/employee_test_partition.txt' INTO TABLE employee_external_test PARTITION (edate="20181101");

# Create a partition directory by hand, then let Hive discover it.
hadoop fs -mkdir /apps/hive/warehouse/employee_external_test/edate=20191001 
hadoop fs -put '/root/TrainingOnHDP/dataset/employee_test_partition.txt' /apps/hive/warehouse/employee_external_test/edate=20191001
-- MSCK registers partition dirs found on disk but missing from the metastore.
MSCK REPAIR TABLE employee_external_test;


# Build a directory tree that follows the edate=<value> partition layout.
hadoop fs -mkdir /user/root/data 
hadoop fs -mkdir /user/root/data/edate=20160101 
# BUGFIX: target must be the edate=20160101 dir created above
# (original wrote to /user/root/data/20160101/, which MSCK would never find).
hadoop fs -put '/root/TrainingOnHDP/dataset/employee_test_partition.txt' /user/root/data/edate=20160101/employee_test_partition.txt


hadoop fs -mkdir /user/root/data/edate=20170101 
hadoop fs -put '/root/TrainingOnHDP/dataset/employee_test.txt' /user/root/data/edate=20170101



-- External partitioned table over the hand-built /user/root/data tree.
CREATE EXTERNAL TABLE employee_external_test(
			name string,
			age int
)
partitioned by (edate string)
location "/user/root/data";

-- Register the edate=... directories that already exist under the location.
MSCK REPAIR TABLE employee_external_test;

-- NOTE(review): employee_external_test1 is never created in this file —
-- presumably a second copy of the table above; confirm before running.
insert into table employee_external_test1 partition (edate) values("peter", 30, "20160109"); 

# Files dropped into a registered partition dir appear in the table at once.
hadoop fs -put '/root/TrainingOnHDP/dataset/employee_test.txt' /user/root/data/edate=20160101

4. Aggregation and Sampling

4.1 Prepare table and data for demonstration

-- Source table for the aggregation / windowing examples below.
CREATE TABLE IF NOT EXISTS employee_contract (
    name        string,
    dept_num    int,
    employee_id int,
    salary      int,
    type        string,
    start_date  date
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;

-- OVERWRITE replaces any previous contents of the table.
LOAD DATA LOCAL INPATH '/root/TrainingOnHDP/dataset/employee_contract.txt'
OVERWRITE INTO TABLE employee_contract;

4.2 The regular aggregations are used as analytic functions

-- Regular aggregates used as analytic (windowed) functions.
SELECT name, dept_num, salary,
-- number of rows in the employee's department
COUNT(*) OVER (PARTITION BY dept_num) AS row_cnt,
-- ordering by the partition key puts the whole partition in one peer group,
-- so this is the department total, not a running total
SUM(salary) OVER(PARTITION BY dept_num ORDER BY dept_num) AS deptTotal,
-- default RANGE frame: all rows tied on dept_num accumulate together
SUM(salary) OVER(ORDER BY dept_num) AS runningTotal1,
-- explicit ROWS frame gives a true row-by-row running total
SUM(salary) OVER(ORDER BY dept_num, name rows unbounded 
preceding) AS runningTotal2
FROM employee_contract
ORDER BY dept_num, name;

4.3 Other analytic functions

-- Ranking and distribution functions per department, ordered by salary.
SELECT name, dept_num, salary,
-- ties share a rank and leave gaps
RANK() OVER (PARTITION BY dept_num ORDER BY salary) AS rank, 
-- ties share a rank, no gaps
DENSE_RANK() OVER (PARTITION BY dept_num ORDER BY salary) 
AS dense_rank,
-- empty OVER(): numbering order across the table is nondeterministic
ROW_NUMBER() OVER () AS row_num,
ROUND((CUME_DIST() OVER (PARTITION BY dept_num 
ORDER BY salary)), 1) AS cume_dist,
PERCENT_RANK() OVER(PARTITION BY dept_num 
ORDER BY salary) AS percent_rank,
-- bucket number 1..4 within each department
NTILE(4) OVER(PARTITION BY dept_num ORDER BY salary) AS ntile
FROM employee_contract
ORDER BY dept_num;

-- Navigation functions.
SELECT name, dept_num, salary,
-- salary two rows ahead (NULL when there is no such row)
LEAD(salary, 2) OVER(PARTITION BY dept_num 
ORDER BY salary) AS lead,
-- salary two rows back, defaulting to 0
LAG(salary, 2, 0) OVER(PARTITION BY dept_num 
ORDER BY salary) AS lag,
FIRST_VALUE(salary) OVER (PARTITION BY dept_num 
ORDER BY salary) AS first_value,
-- with the default frame LAST_VALUE only sees up to the current row
LAST_VALUE(salary) OVER (PARTITION BY dept_num 
ORDER BY salary) AS last_value_default,
-- an explicit full-partition frame gives the true last value
LAST_VALUE(salary) OVER (PARTITION BY dept_num 
ORDER BY salary 
RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
AS last_value
FROM employee_contract ORDER BY dept_num;

-- Window-frame variations: each winN demonstrates a different ROWS frame
-- relative to the current row, within the department, ordered by name.
SELECT name, dept_num, salary AS sal,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY
name ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) win1,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
name ROWS BETWEEN 2 PRECEDING AND UNBOUNDED FOLLOWING) win2,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
name ROWS BETWEEN 1 PRECEDING AND 2 FOLLOWING) win3,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
name ROWS BETWEEN 1 FOLLOWING AND 2 FOLLOWING) win5,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
name ROWS BETWEEN CURRENT ROW AND CURRENT ROW) win7,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
name ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING) win8,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
name ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) win9,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
name ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) win10,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
name ROWS BETWEEN UNBOUNDED PRECEDING AND 1 FOLLOWING) win11,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
name ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED
FOLLOWING) win12,
-- ROWS n PRECEDING is shorthand for ROWS BETWEEN n PRECEDING AND CURRENT ROW
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
name ROWS 2 PRECEDING) win13
FROM employee_contract
ORDER BY dept_num, name;

-- Named windows via the WINDOW clause.
-- Fixes vs. original: win2/win3 referenced w1, leaving w2/w3 unused; the
-- WINDOW clause must precede ORDER BY; and "w2 AS w3" forward-referenced a
-- window defined after it — a named window may only reference earlier ones.
SELECT name, dept_num, salary,
MAX(salary) OVER w1 AS win1,
MAX(salary) OVER w2 AS win2,
MAX(salary) OVER w3 AS win3
FROM employee_contract
WINDOW
w1 AS (PARTITION BY dept_num ORDER BY name ROWS BETWEEN 
2 PRECEDING AND CURRENT ROW),
w3 AS (PARTITION BY dept_num ORDER BY name ROWS BETWEEN 
1 PRECEDING AND 2 FOLLOWING),
w2 AS w3
ORDER BY dept_num, name;

-- RANGE frame over a numeric ordering column: max salary among employees of
-- the same department whose start_year is within 2 years before this one.
SELECT name, salary, start_year,
MAX(salary) OVER (PARTITION BY dept_num ORDER BY 
start_year RANGE BETWEEN 2 PRECEDING AND CURRENT ROW) win1
FROM
(
  -- derive start_year so the RANGE arithmetic has a numeric column
  SELECT name, salary, dept_num, 
  YEAR(start_date) AS start_year
  FROM employee_contract
) a;


4.4 Bucket table sampling example

-- Bucketed table: rows are hashed on employee_id into 2 bucket files,
-- which is what makes TABLESAMPLE(BUCKET ...) below possible.
CREATE TABLE employee_id_buckets                         
(
  name string,
  employee_id int,
  work_place ARRAY<string>,
  sex_age STRUCT<sex:string,age:int>,
  skills_score MAP<string,int>,
  depart_title MAP<STRING,ARRAY<STRING>>
)
CLUSTERED BY (employee_id) INTO 2 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
COLLECTION ITEMS TERMINATED BY ','
MAP KEYS TERMINATED BY ':';

-- Manually match reducers to bucket count.  BUGFIX: the property is
-- mapred.reduce.tasks — "map.reduce.tasks" is not a real setting and was
-- silently ignored.
set mapred.reduce.tasks = 2;

-- With enforce.bucketing on, Hive derives the reducer count itself and
-- guarantees correctly bucketed output files.
set hive.enforce.bucketing = true;

INSERT OVERWRITE TABLE employee_id_buckets SELECT * FROM employee_id;

You may run into an OutOfMemory error here because the Tez engine is in use; see:

https://azure.microsoft.com/en-us/blog/hive-memory-settings-resolve-out-of-memory-errors-using-azure-hdinsight/

the following two memory settings define the container memory for the heap: hive.tez.container.size and hive.tez.java.opts. 
From my experience, the OOM exception does not mean the container size is too small. It means the Java heap size (hive.tez.java.opts) 
is too small. So whenever you see OOM, you can try to increase “hive.tez.java.opts.” If needed you might have to increase “hive.tez.container.size.” 
The “java.opts” should be around 80% of “container.size.”

-- Hive scripts use "--" for comments ("#" is not valid Hive comment syntax).
-- SET hive.tez.container.size=10240;
SET hive.tez.java.opts=-Xmx1024m;

-- Bucket sampling: take bucket 1 of 2, hashing on rand() instead of the
-- clustering column so the sample is drawn from the whole data set.
SELECT name FROM employee_id_buckets TABLESAMPLE(BUCKET 1 OUT OF 2 ON rand()) a;

4.5 Block sampling - Sample by rows

SELECT name FROM employee_id_buckets TABLESAMPLE(4 ROWS) a;

4.6 Sample by percentage of data size

SELECT name FROM employee_id_buckets TABLESAMPLE(10 PERCENT) a;

4.7 Sample by data size

SELECT name FROM employee_id_buckets TABLESAMPLE(3M) a;

-- Does NOT work here: hash_md5 is a third-party UDF that must be imported
-- first.  ("#" replaced with "--": not valid Hive comment syntax.)
-- select * from employee_id where abs( hash_md5(employee_id) ) % 100 < 10;



6. Hive Integration with other tools
	6.1 Hcatalog
	
	-- Driver master data loaded from CSV; the first line is a header row,
	-- hence skip.header.line.count = 1.
	create table drivers
		(driverId int,
		name string,
		ssn bigint,
		location string,
		certified string,
		wageplan string)
	ROW FORMAT DELIMITED
	FIELDS TERMINATED BY ','
	STORED AS TEXTFILE
	TBLPROPERTIES("skip.header.line.count"="1");
	
	LOAD DATA LOCAL INPATH '/root/TrainingOnHDP/dataset/drivers.csv' OVERWRITE INTO TABLE drivers;
	
	
	-- Truck telemetry events from CSV; header line skipped on load.
	CREATE TABLE truck_events (
		driverId      int,
		truckId       int,
		eventTime     string,
		eventType     string,
		longitude     double,
		latitude      double,
		eventKey      string,
		correlationId bigint,
		driverName    string,
		routeId       int,
		routeName     string)
	ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
	STORED AS TEXTFILE
	TBLPROPERTIES("skip.header.line.count"="1");

	LOAD DATA LOCAL INPATH '/root/TrainingOnHDP/dataset/truck_event_text_partition.csv' OVERWRITE INTO TABLE truck_events;

	-- Join each event to its driver's certification status.
	SELECT a.driverId, a.driverName, a.eventType, b.certified
	FROM truck_events a
	JOIN drivers b ON (a.driverId = b.driverId);
	
	PIG:
	
	-- HCatLoader reads each table's schema from the Hive metastore, so no
	-- AS (...) schema is declared on the LOAD.
	a = LOAD 'drivers' using org.apache.hive.hcatalog.pig.HCatLoader();
	b = LOAD 'truck_events' using org.apache.hive.hcatalog.pig.HCatLoader();
	-- NOTE(review): Hive stores column names lower-cased, so 'driverid' is
	-- presumably the field name HCatLoader exposes even though the DDL said
	-- driverId — Pig field names are case-sensitive; verify with DESCRIBE a.
	c = join b by driverid, a by driverid;
	dump c;
	
	WebHCat
	
	-- REST DDL endpoint: describe table 'employee' in database 'default'.
	http://127.0.0.1:50111/templeton/v1/ddl/database/default/table/employee?user.name=hive
	
	
	
	6.2 Oozie
	6.3 HBase
	
		6.3.1 
		-- Table over raw pagecount files already sitting in HDFS (LOCATION).
		CREATE TABLE IF NOT EXISTS pagecounts (projectcode STRING, pagename STRING, pageviews STRING, bytes STRING)
		ROW FORMAT
		DELIMITED FIELDS TERMINATED BY ' '
		LINES TERMINATED BY '\n'
		STORED AS TEXTFILE
		LOCATION '/user/root/pagecounts';
		
		-- INPUT__FILE__NAME is a virtual column holding each row's source file.
		select INPUT__FILE__NAME from pagecounts limit 10;
		
		-- Build an HBase-style row key: projectcode/pagename/yyyymmdd-hhmmss,
		-- where the timestamp is parsed out of the source file name.
		CREATE VIEW IF NOT EXISTS pgc (rowkey, pageviews, bytes) AS
		SELECT concat_ws('/',
				projectcode,
				concat_ws('/',
				pagename,
				regexp_extract(INPUT__FILE__NAME, 'pagecounts-(\\d{8}-\\d{6})', 1))),
				pageviews, bytes
		FROM pagecounts;
		
		-- Hive table backed by HBase table PAGECOUNTS; ":key" maps rowkey,
		-- the other columns live in column family "0".
		CREATE TABLE IF NOT EXISTS pagecounts_hbase (rowkey STRING, pageviews STRING, bytes STRING)
		STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
		WITH SERDEPROPERTIES ('hbase.columns.mapping' = ':key,0:PAGEVIEWS,0:BYTES')
		TBLPROPERTIES ('hbase.table.name' = 'PAGECOUNTS');
		
		-- Copy a small sample of rows from the view into HBase.
		FROM pgc INSERT INTO TABLE pagecounts_hbase SELECT pgc.* WHERE rowkey LIKE 'en/q%' LIMIT 10;
		
		hbase shell
		
		scan 'PAGECOUNTS'
		
		-- NOTE(review): the statement below looks like Phoenix syntax (view
		-- over the same HBase table) — run it in a Phoenix client, not Hive.
		CREATE VIEW "PAGECOUNTS" (pk VARCHAR PRIMARY KEY,
		"0".PAGEVIEWS VARCHAR,
		"0".BYTES VARCHAR)
		
	6.4 Zeppelin
	
	http://127.0.0.1:9995/
	
	%hive(default)

	select * from employee
	
	show tables
	
	6.5 Tableau
	6.6 Talend Open Studio
	6.7 Datameer
	6.8 Excel
	6.9 Qlikview
	6.10 Sqoop
	
		# Full import of MySQL table 'stocks' into Hive, replacing existing data.
		sqoop import --hive-import --hive-overwrite --connect jdbc:mysql://localhost/sqoop_test --table stocks --fetch-size 10 --username hip_sqoop_user -P
		# Import into a specific static partition (edate=<value>); only the first
		# run uses --hive-overwrite, later ones append new partitions.
		sqoop import --hive-import --hive-overwrite --hive-partition-key edate --hive-partition-value "20160101" --target-dir /user/root/stocks/edate=20160101 --connect jdbc:mysql://localhost/sqoop_test --table stocks --fetch-size 10 --username hip_sqoop_user -P
		sqoop import --hive-import --hive-partition-key edate --hive-partition-value "20160102" --target-dir /user/root/stocks/edate=20160102 --connect jdbc:mysql://localhost/sqoop_test --table stocks --fetch-size 10 --username hip_sqoop_user -P
		sqoop import --hive-import --hive-partition-key edate --hive-partition-value "20160103" --target-dir /user/root/stocks/edate=20160103 --connect jdbc:mysql://localhost/sqoop_test --table stocks --fetch-size 10 --username hip_sqoop_user -P

	6.11 MySQL (MySQLHandler and Select AS)
	
	-- Register the WSO2 JDBC storage-handler jar for this session.
	add jar /root/TrainingOnHDP/lib/hive-jdbc-handler-0.8.1-wso2v7.jar;
	
	-- Hive table that reads/writes the MySQL table 'business' over JDBC.
	-- NOTE(review): credentials are hard-coded in plain text — acceptable for
	-- a sandbox only; use a credential store anywhere else.
	CREATE EXTERNAL TABLE business
	ROW FORMAT SERDE 'org.wso2.carbon.hadoop.hive.jdbc.storage.JDBCDataSerDe'
	with serdeproperties (
		"escaped" = "true"
	)
	STORED BY 'org.wso2.carbon.hadoop.hive.jdbc.storage.JDBCStorageHandler'
	TBLPROPERTIES (
		"mapred.jdbc.driver.class"="com.mysql.jdbc.Driver",
		"mapred.jdbc.url"="jdbc:mysql://localhost:3306/test",
		"mapred.jdbc.username"="hive",
		"mapred.jdbc.password"="root",
		"mapred.jdbc.input.table.name"="business",
		"mapred.jdbc.output.table.name"="business"	
    );
	

	-- Writes go to a MySQL table the handler creates from table.create.query;
	-- rows matching the 'brand' key are updated rather than duplicated.
	CREATE EXTERNAL TABLE PhonebrandTable(brand STRING,totalOrders INT, totalQuantity INT)
	STORED BY 'org.wso2.carbon.hadoop.hive.jdbc.storage.JDBCStorageHandler'
	TBLPROPERTIES (
		"mapred.jdbc.driver.class"="com.mysql.jdbc.Driver",
		"mapred.jdbc.url"="jdbc:mysql://localhost:3306/test",
		"mapred.jdbc.username"="root",
		"mapred.jdbc.password"="",
		"hive.jdbc.update.on.duplicate" = "true",
		"hive.jdbc.primary.key.fields" = "brand",
		"hive.jdbc.table.create.query" = "CREATE TABLE brandSummary (brand VARCHAR(100) NOT NULL PRIMARY KEY, totalOrders INT, totalQuantity INT)"
	);	
	
	6.12 Spark
		// Query Hive tables from Spark via HiveContext (Spark 1.x API).
		val sqlContext = new org.apache.spark.sql.hive.HiveContext(sc)
		sqlContext.sql("FROM employee SELECT *").collect().foreach(println)
		sqlContext.sql("FROM employee SELECT count(*)").collect().foreach(println)
		sqlContext.sql("FROM employee_external_test1 SELECT *").collect().foreach(println)
		// Partition discovery also works through the Hive context.
		sqlContext.sql("MSCK REPAIR TABLE employee_external_test1")
	
	6.13 kylin
	
	6.14 Zookeeper
	
	
	
-- (Whitespace de-garbled: the original had literal tabs between every word,
-- an artifact of copying from a formatted document.)

-- Import into the table named in the export's metadata:
IMPORT FROM '/user/root/stocks1';

-- Import data to a new table:
IMPORT TABLE stocks_imported FROM '/user/root/stocks1';

-- Import data to an external table, where the LOCATION property is optional:
IMPORT EXTERNAL TABLE stocks_imported_external FROM '/user/root/stocks1'
LOCATION '/user/root/stocks3';

-- Export and import partitions:
EXPORT TABLE stocks partition(edate=20160101) TO '/user/root/stocks7';

-- List open/aborted ACID transactions (shown at the beeline prompt):
jdbc:hive2://> SHOW TRANSACTIONS;

		
	
	
16. Performance utilities: Explain and Analyze


	-- Syntax template (not runnable as-is):
	EXPLAIN [EXTENDED|DEPENDENCY|AUTHORIZATION] hive_query
	
	EXPLAIN SELECT sex_age.sex, count(*) FROM employee_partitioned WHERE year=2014 GROUP BY sex_age.sex LIMIT 2;


	-- Gather table-level statistics for the cost-based optimizer.
	ANALYZE TABLE employee COMPUTE STATISTICS;

	-- Statistics for a single partition only.
	ANALYZE TABLE employee_partitioned PARTITION(year=2014, month=12) COMPUTE STATISTICS;

	-- Column-level statistics for employee_id.
	ANALYZE TABLE employee_id COMPUTE STATISTICS FOR COLUMNS employee_id;
	

